Dream Housing Finance company deals in all kinds of home loans. They have a presence across all urban, semi-urban and rural areas. The customer first applies for a home loan and after that the company validates the customer's eligibility for the loan. The company wants to automate the loan eligibility process (real time) based on customer details provided while filling the online application form. These details are Gender, Marital Status, Education, Number of Dependents, Income, Loan Amount, Credit History and others. To automate this process, they have provided a dataset to identify the customer segments that are eligible for the loan amount so that they can specifically target these customers.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from collections import Counter
import math
from sklearn.impute import SimpleImputer
import imblearn
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from pandas_profiling import ProfileReport
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier,RandomForestClassifier
from sklearn.metrics import accuracy_score,classification_report,ConfusionMatrixDisplay,confusion_matrix
from sklearn import tree
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
from sklearn.tree import plot_tree
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics import balanced_accuracy_score, roc_auc_score,roc_curve
from matplotlib.pyplot import subplots, cm
from ISLP.svm import plot as plot_svm
import graphviz
C:\Users\HP\AppData\Local\Temp\ipykernel_12436\1836245256.py:12: DeprecationWarning: `import pandas_profiling` is going to be deprecated by April 1st. Please use `import ydata_profiling` instead. from pandas_profiling import ProfileReport
# Load the loan-application dataset; one row per application.
Bootcamp=pd.read_csv('Loan_Data.csv')
# Sanity check on size: (rows, columns).
Bootcamp.shape
(614, 13)
Bootcamp.head(5)
| Loan_ID | Gender | Married | Dependents | Education | Self_Employed | ApplicantIncome | CoapplicantIncome | LoanAmount | Loan_Amount_Term | Credit_History | Property_Area | Loan_Status | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | LP001002 | Male | No | 0 | Graduate | No | 5849 | 0.0 | NaN | 360.0 | 1.0 | Urban | Y |
| 1 | LP001003 | Male | Yes | 1 | Graduate | No | 4583 | 1508.0 | 128.0 | 360.0 | 1.0 | Rural | N |
| 2 | LP001005 | Male | Yes | 0 | Graduate | Yes | 3000 | 0.0 | 66.0 | 360.0 | 1.0 | Urban | Y |
| 3 | LP001006 | Male | Yes | 0 | Not Graduate | No | 2583 | 2358.0 | 120.0 | 360.0 | 1.0 | Urban | Y |
| 4 | LP001008 | Male | No | 0 | Graduate | No | 6000 | 0.0 | 141.0 | 360.0 | 1.0 | Urban | Y |
# Verify Loan_ID is a unique key (expect 0 duplicates).
Bootcamp['Loan_ID'].duplicated().sum()
0
Bootcamp.tail(10)
| Loan_ID | Gender | Married | Dependents | Education | Self_Employed | ApplicantIncome | CoapplicantIncome | LoanAmount | Loan_Amount_Term | Credit_History | Property_Area | Loan_Status | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 604 | LP002959 | Female | Yes | 1 | Graduate | No | 12000 | 0.0 | 496.0 | 360.0 | 1.0 | Semiurban | Y |
| 605 | LP002960 | Male | Yes | 0 | Not Graduate | No | 2400 | 3800.0 | NaN | 180.0 | 1.0 | Urban | N |
| 606 | LP002961 | Male | Yes | 1 | Graduate | No | 3400 | 2500.0 | 173.0 | 360.0 | 1.0 | Semiurban | Y |
| 607 | LP002964 | Male | Yes | 2 | Not Graduate | No | 3987 | 1411.0 | 157.0 | 360.0 | 1.0 | Rural | Y |
| 608 | LP002974 | Male | Yes | 0 | Graduate | No | 3232 | 1950.0 | 108.0 | 360.0 | 1.0 | Rural | Y |
| 609 | LP002978 | Female | No | 0 | Graduate | No | 2900 | 0.0 | 71.0 | 360.0 | 1.0 | Rural | Y |
| 610 | LP002979 | Male | Yes | 3+ | Graduate | No | 4106 | 0.0 | 40.0 | 180.0 | 1.0 | Rural | Y |
| 611 | LP002983 | Male | Yes | 1 | Graduate | No | 8072 | 240.0 | 253.0 | 360.0 | 1.0 | Urban | Y |
| 612 | LP002984 | Male | Yes | 2 | Graduate | No | 7583 | 0.0 | 187.0 | 360.0 | 1.0 | Urban | Y |
| 613 | LP002990 | Female | No | 0 | Graduate | Yes | 4583 | 0.0 | 133.0 | 360.0 | 0.0 | Semiurban | N |
ProfileReport(Bootcamp)
Summarize dataset: 0%| | 0/5 [00:00<?, ?it/s]
Generate report structure: 0%| | 0/1 [00:00<?, ?it/s]
Render HTML: 0%| | 0/1 [00:00<?, ?it/s]
Bootcamp.isnull().sum()
Loan_ID 0 Gender 13 Married 3 Dependents 15 Education 0 Self_Employed 32 ApplicantIncome 0 CoapplicantIncome 0 LoanAmount 22 Loan_Amount_Term 14 Credit_History 50 Property_Area 0 Loan_Status 0 dtype: int64
# Correlation of missingness indicators: do columns tend to be missing together?
sns.heatmap(Bootcamp.isna().corr(),cmap='YlGnBu',annot=True,annot_kws={"size":6})
<AxesSubplot:>
# Pairwise correlation of the numeric columns.
# Fix: numeric_only=True makes the exclusion of object columns explicit —
# pandas >= 2.0 raises on DataFrame.corr() over mixed dtypes without it.
sns.heatmap(Bootcamp.corr(numeric_only=True),cmap='YlGnBu',annot=True,annot_kws={"size":6})
<AxesSubplot:>
# Normalized Shannon entropy (log base = number of distinct values) per column.
# Fixes: removed the spurious `J==J.append(p)` — a no-op comparison whose
# result was discarded (list.append mutates in place and returns None) —
# and the Counter(H) recomputed once per distinct value.
# NOTE(review): the loop iterates x's columns (NaN-heavy columns dropped) but
# computes each entropy on the full Bootcamp column, NaN rows included —
# behavior preserved here; confirm that is intended.
x=Bootcamp.drop(["Credit_History","LoanAmount","Loan_Amount_Term"],axis=1)
x.dropna(axis=0,inplace=True)
for i in x.columns:
    H = np.array(Bootcamp[i])
    counts = list(Counter(H).values())  # frequency of each distinct value
    n = len(H)
    entropy = 0
    for k in counts:
        p = k / n
        # log base len(counts) scales entropy to [0, 1] for this column.
        # (A single-valued column would raise here, as in the original.)
        entropy = entropy + (-p * math.log(p, len(counts)))
    print (f"The entropy for {i} is {entropy}")
The entropy for Loan_ID is 1.0000000000000002 The entropy for Gender is 0.521824157615196 The entropy for Married is 0.6137709299053016 The entropy for Dependents is 0.7557576132142594 The entropy for Education is 0.75694266717241 The entropy for Self_Employed is 0.5371275860545196 The entropy for ApplicantIncome is 0.9811517564817506 The entropy for CoapplicantIncome is 0.6685261046939365 The entropy for Property_Area is 0.9946999272825048 The entropy for Loan_Status is 0.8962696698442932
# ApplicantIncome distribution with markers for the mean (red dashed)
# and the median (black solid).
income = Bootcamp['ApplicantIncome']
hist = plt.hist(income, color='yellow', density=True)
plt.axvline(income.mean(), color='red', linestyle='--')
plt.axvline(income.median(), color='black', linestyle='-')
<matplotlib.lines.Line2D at 0x2440dc778b0>
# CoapplicantIncome distribution with mean (red dashed) and median
# (black solid) markers.
coapp = Bootcamp['CoapplicantIncome']
hist = plt.hist(coapp, color='green', density=True)
plt.axvline(coapp.mean(), color='red', linestyle='--')
plt.axvline(coapp.median(), color='black', linestyle='-')
<matplotlib.lines.Line2D at 0x2440dc3e550>
# LoanAmount distribution with mean (red dashed) and median (black solid)
# markers.
loan_amt = Bootcamp['LoanAmount']
hist = plt.hist(loan_amt, color='blue', density=True)
plt.axvline(loan_amt.mean(), color='red', linestyle='--')
plt.axvline(loan_amt.median(), color='black', linestyle='-')
<matplotlib.lines.Line2D at 0x2440dc58820>
sns.kdeplot(x='Credit_History',data=Bootcamp)
<AxesSubplot:xlabel='Credit_History', ylabel='Density'>
sns.kdeplot(x='LoanAmount',data=Bootcamp)
<AxesSubplot:xlabel='LoanAmount', ylabel='Density'>
sns.kdeplot(x='Loan_Amount_Term',data=Bootcamp)
<AxesSubplot:xlabel='Loan_Amount_Term', ylabel='Density'>
g=sns.FacetGrid(Bootcamp,col='Gender')
g.map(plt.hist,"Loan_Status")
<seaborn.axisgrid.FacetGrid at 0x2440c4ae430>
sns.countplot(x='Gender',data=Bootcamp,hue='Loan_Status')
<AxesSubplot:xlabel='Gender', ylabel='count'>
g=sns.FacetGrid(Bootcamp,col='Credit_History')
g.map(plt.hist,"Loan_Status")
<seaborn.axisgrid.FacetGrid at 0x2440db530d0>
sns.countplot(x='Credit_History',data=Bootcamp,hue='Loan_Status')
<AxesSubplot:xlabel='Credit_History', ylabel='count'>
g=sns.FacetGrid(Bootcamp,col='Property_Area')
g.map(plt.hist,"Loan_Status")
<seaborn.axisgrid.FacetGrid at 0x2440b57f130>
sns.countplot(x='Property_Area',data=Bootcamp,hue='Loan_Status')
<AxesSubplot:xlabel='Property_Area', ylabel='count'>
g=sns.FacetGrid(Bootcamp,col='Self_Employed')
g.map(plt.hist,"Loan_Status")
<seaborn.axisgrid.FacetGrid at 0x2440c325a00>
sns.countplot(x='Self_Employed',data=Bootcamp,hue='Loan_Status')
<AxesSubplot:xlabel='Self_Employed', ylabel='count'>
g=sns.FacetGrid(Bootcamp,col='Education')
g.map(plt.hist,"Loan_Status")
<seaborn.axisgrid.FacetGrid at 0x2440ba68910>
sns.countplot(x='Education',data=Bootcamp,hue='Loan_Status')
<AxesSubplot:xlabel='Education', ylabel='count'>
g=sns.FacetGrid(Bootcamp,col='Married')
g.map(plt.hist,"Loan_Status")
<seaborn.axisgrid.FacetGrid at 0x2440882ed00>
sns.countplot(x='Married',data=Bootcamp,hue='Loan_Status')
<AxesSubplot:xlabel='Married', ylabel='count'>
g=sns.FacetGrid(Bootcamp,col='Dependents')
g.map(plt.hist,"Loan_Status")
<seaborn.axisgrid.FacetGrid at 0x244090d0160>
sns.countplot(x='Dependents',data=Bootcamp,hue='Loan_Status')
<AxesSubplot:xlabel='Dependents', ylabel='count'>
# Dropping Loan_ID since it won't help us in Calculation
# (it is a unique row identifier — see the duplicate check above — with no
# predictive value).
Bootcamp.drop('Loan_ID',axis=1,inplace=True)
sns.kdeplot(x="ApplicantIncome",data=Bootcamp,hue="Loan_Status")
<AxesSubplot:xlabel='ApplicantIncome', ylabel='Density'>
sns.boxplot(x='Loan_Status',y='ApplicantIncome',data=Bootcamp)
<AxesSubplot:xlabel='Loan_Status', ylabel='ApplicantIncome'>
sns.kdeplot(x="CoapplicantIncome",data=Bootcamp,hue="Loan_Status")
<AxesSubplot:xlabel='CoapplicantIncome', ylabel='Density'>
# Box plot of CoapplicantIncome by loan outcome.
# Fix: dropped hue="Loan_Status" — it duplicated x and only added a redundant
# legend; now consistent with the sibling box plots for the other columns.
sns.boxplot(x='Loan_Status',y='CoapplicantIncome',data=Bootcamp)
<AxesSubplot:xlabel='Loan_Status', ylabel='CoapplicantIncome'>
sns.kdeplot(x="LoanAmount",data=Bootcamp,hue="Loan_Status")
<AxesSubplot:xlabel='LoanAmount', ylabel='Density'>
sns.boxplot(x="Loan_Status",y="LoanAmount",data=Bootcamp)
<AxesSubplot:xlabel='Loan_Status', ylabel='LoanAmount'>
sns.kdeplot(x="Loan_Amount_Term",data=Bootcamp,hue="Loan_Status")
<AxesSubplot:xlabel='Loan_Amount_Term', ylabel='Density'>
sns.barplot(x="Married",y='CoapplicantIncome',data=Bootcamp,hue="Loan_Status")
<AxesSubplot:xlabel='Married', ylabel='CoapplicantIncome'>
sns.histplot(x='CoapplicantIncome',data=Bootcamp,hue="Married")
<AxesSubplot:xlabel='CoapplicantIncome', ylabel='Count'>
sns.barplot(x='Married',y="CoapplicantIncome",data=Bootcamp,hue='Credit_History')
<AxesSubplot:xlabel='Married', ylabel='CoapplicantIncome'>
"One can see from the first of the three graphs above that people who are married have a higher average co-applicant income and hence a higher probability of loan approval"
# to check datatypes DataFrame.dtypes
# The object-typed columns below are the categoricals that still need encoding.
Bootcamp.dtypes
Gender object Married object Dependents object Education object Self_Employed object ApplicantIncome int64 CoapplicantIncome float64 LoanAmount float64 Loan_Amount_Term float64 Credit_History float64 Property_Area object Loan_Status object dtype: object
# One-hot encode the categorical predictors. Rows where a category is NaN get
# all-zero dummies for that variable (get_dummies does not impute).
Dumm=pd.get_dummies(Bootcamp[['Gender','Married','Dependents','Education','Self_Employed','Property_Area']])
Bootcamp=Dumm.join(Bootcamp)
# Dropping the Categorical Variables
Bootcamp.drop(['Gender','Married','Dependents','Education','Self_Employed','Property_Area'],axis=1,inplace=True)
# Impute Credit_History (50 missing values per the isnull count above) with
# the column mode.
Loan=SimpleImputer(strategy='most_frequent')
Credit_History=pd.DataFrame(Loan.fit_transform(Bootcamp[['Credit_History']]))
Bootcamp.drop('Credit_History',axis=1,inplace=True)
# NOTE(review): this join relies on both frames sharing the default RangeIndex;
# it holds here because no rows have been dropped from Bootcamp — verify if
# that ever changes.
Bootcamp=Bootcamp.join(Credit_History)
# The imputer output column is named 0; restore the original name.
Bootcamp=Bootcamp.rename(columns={0:"Credit_History"})
Bootcamp
| Gender_Female | Gender_Male | Married_No | Married_Yes | Dependents_0 | Dependents_1 | Dependents_2 | Dependents_3+ | Education_Graduate | Education_Not Graduate | ... | Self_Employed_Yes | Property_Area_Rural | Property_Area_Semiurban | Property_Area_Urban | ApplicantIncome | CoapplicantIncome | LoanAmount | Loan_Amount_Term | Loan_Status | Credit_History | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 1 | 5849 | 0.0 | NaN | 360.0 | Y | 1.0 |
| 1 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | ... | 0 | 1 | 0 | 0 | 4583 | 1508.0 | 128.0 | 360.0 | N | 1.0 |
| 2 | 0 | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 1 | 0 | ... | 1 | 0 | 0 | 1 | 3000 | 0.0 | 66.0 | 360.0 | Y | 1.0 |
| 3 | 0 | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 1 | 2583 | 2358.0 | 120.0 | 360.0 | Y | 1.0 |
| 4 | 0 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 1 | 6000 | 0.0 | 141.0 | 360.0 | Y | 1.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 609 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | ... | 0 | 1 | 0 | 0 | 2900 | 0.0 | 71.0 | 360.0 | Y | 1.0 |
| 610 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 1 | 0 | ... | 0 | 1 | 0 | 0 | 4106 | 0.0 | 40.0 | 180.0 | Y | 1.0 |
| 611 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 1 | 8072 | 240.0 | 253.0 | 360.0 | Y | 1.0 |
| 612 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 1 | 7583 | 0.0 | 187.0 | 360.0 | Y | 1.0 |
| 613 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | ... | 1 | 0 | 1 | 0 | 4583 | 0.0 | 133.0 | 360.0 | N | 0.0 |
614 rows × 21 columns
# One-hot encode Loan_Amount_Term (a small set of discrete term lengths).
Loan=pd.get_dummies(Bootcamp['Loan_Amount_Term'])
# Renaming Column Names of Loan_Amount_Term
# (from the raw float value, e.g. 360.0, to "Loan Term is 360.0").
for i in Loan.columns:
    Loan=Loan.rename(columns={i:"Loan Term is " + str(i)})
Bootcamp=Bootcamp.join(Loan)
Bootcamp.drop('Loan_Amount_Term',axis=1,inplace=True)
# Impute the 22 missing LoanAmount values with the column mean
# (SimpleImputer's default strategy).
Loan=SimpleImputer()
LoanAmount=pd.DataFrame(Loan.fit_transform(Bootcamp[['LoanAmount']]))
Bootcamp.drop('LoanAmount',axis=1,inplace=True)
# Join relies on the shared default RangeIndex, as with Credit_History above.
Bootcamp=Bootcamp.join(LoanAmount)
Bootcamp=Bootcamp.rename(columns={0:"LoanAmount"})
Bootcamp.isna().sum()
Gender_Female 0 Gender_Male 0 Married_No 0 Married_Yes 0 Dependents_0 0 Dependents_1 0 Dependents_2 0 Dependents_3+ 0 Education_Graduate 0 Education_Not Graduate 0 Self_Employed_No 0 Self_Employed_Yes 0 Property_Area_Rural 0 Property_Area_Semiurban 0 Property_Area_Urban 0 ApplicantIncome 0 CoapplicantIncome 0 Loan_Status 0 Credit_History 0 Loan Term is 12.0 0 Loan Term is 36.0 0 Loan Term is 60.0 0 Loan Term is 84.0 0 Loan Term is 120.0 0 Loan Term is 180.0 0 Loan Term is 240.0 0 Loan Term is 300.0 0 Loan Term is 360.0 0 Loan Term is 480.0 0 LoanAmount 0 dtype: int64
X=Bootcamp.drop("Loan_Status",axis=1)
Bootcamp["Loan_Status"]
0 Y
1 N
2 Y
3 Y
4 Y
..
609 Y
610 Y
611 Y
612 Y
613 N
Name: Loan_Status, Length: 614, dtype: object
Bootcamp
| Gender_Female | Gender_Male | Married_No | Married_Yes | Dependents_0 | Dependents_1 | Dependents_2 | Dependents_3+ | Education_Graduate | Education_Not Graduate | ... | Loan Term is 36.0 | Loan Term is 60.0 | Loan Term is 84.0 | Loan Term is 120.0 | Loan Term is 180.0 | Loan Term is 240.0 | Loan Term is 300.0 | Loan Term is 360.0 | Loan Term is 480.0 | LoanAmount | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 146.412162 |
| 1 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 128.000000 |
| 2 | 0 | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 66.000000 |
| 3 | 0 | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 120.000000 |
| 4 | 0 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 141.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 609 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 71.000000 |
| 610 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 40.000000 |
| 611 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 253.000000 |
| 612 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 187.000000 |
| 613 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 133.000000 |
614 rows × 30 columns
# Feature matrix / target split, encoding the target as Y -> 1, N -> 0.
X = Bootcamp.drop("Loan_Status", axis=1)
label_codes = {'N': 0, 'Y': 1}
y = np.array([label_codes[status] for status in Bootcamp['Loan_Status']])
# Hold out 20% for testing; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101)
# Class distribution of the training labels.
sns.countplot(y_train)
<AxesSubplot:ylabel='count'>
# Oversample the minority class with SMOTE (5 nearest neighbors) on the
# training split only, so the test set keeps the original class distribution.
Loan=SMOTE(k_neighbors=5)
X_train,y_train=Loan.fit_resample(X_train,y_train)
# The resampled label distribution should now be balanced.
sns.histplot(y_train)
<AxesSubplot:ylabel='Count'>
# Regularization grid: l1 (lasso) vs l2 (ridge) penalty; C is the inverse
# regularization strength.
param_grid={"penalty":["l1","l2"],"C":[0.1,1,10]}
# Doing our Grid Search.
# Fix: the default lbfgs solver does not support the l1 penalty, so all l1
# grid cells would fail; liblinear supports both l1 and l2.
logit=GridSearchCV(LogisticRegression(solver="liblinear"),param_grid,cv=5,scoring="accuracy")
# Fitting our Grid Search model with Training Data Set
logit.fit(X_train,y_train)
GridSearchCV(cv=5, estimator=LogisticRegression(),
param_grid={'C': [0.1, 1, 10], 'penalty': ['l1', 'l2']},
scoring='accuracy')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. GridSearchCV(cv=5, estimator=LogisticRegression(),
param_grid={'C': [0.1, 1, 10], 'penalty': ['l1', 'l2']},
scoring='accuracy')LogisticRegression()
LogisticRegression()
#Finding our best parameter
logit.best_params_
{'C': 10, 'penalty': 'l2'}
# Build a fresh logistic-regression model from the tuned hyper-parameters
# and retrain it on the (resampled) training set.
best = logit.best_params_
FinalLog = LogisticRegression(penalty=best["penalty"], C=best["C"])
FinalLog.fit(X_train, y_train)
LogisticRegression(C=10)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
LogisticRegression(C=10)
# Score the held-out test set and visualize the confusion matrix.
TunedLog = FinalLog.predict(X_test)
# Separate names for the raw matrix and its display object (the original
# reused `cm` for both).
logit_cm = confusion_matrix(y_test, TunedLog)
logit_cm_display = ConfusionMatrixDisplay(logit_cm)
logit_cm_display.plot()
<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x172c6f3b580>
#printing classification report
print (classification_report(y_test,TunedLog))
precision recall f1-score support
0 0.89 0.53 0.67 45
1 0.78 0.96 0.86 78
accuracy 0.80 123
macro avg 0.84 0.75 0.76 123
weighted avg 0.82 0.80 0.79 123
#printing Accuracy Score
print( f"the accuracy score for test data set is {accuracy_score(y_test,TunedLog)}")
the accuracy score for test data set is 0.8048780487804879
# Finding our detailed Accuracy Score of both training and data
TrainedLog=FinalLog.predict(X_train)
#printing classification report with of Training Data
print (classification_report(y_train,TrainedLog))
precision recall f1-score support
0 0.92 0.73 0.81 344
1 0.77 0.93 0.85 344
accuracy 0.83 688
macro avg 0.84 0.83 0.83 688
weighted avg 0.84 0.83 0.83 688
#printing Accuracy Score
print( f"the accuracy score for train data set is {accuracy_score(y_train,TrainedLog)}")
the accuracy score for train data set is 0.8299418604651163
# Overlay actual vs predicted labels by sample index, train then test.
# Fix: cmap="red" was silently ignored by scatter (cmap only applies when a
# c= array is given); color= is what actually sets the marker color.
plt.scatter(x=range(len(X_train)),y=y_train,label='Training Actual')
plt.scatter(x=range(len(X_train)),y=TrainedLog,color="red",label='Training Prediction')
plt.xlabel('Index')
plt.ylabel('Value')
plt.title('Training Set Prediction')
plt.legend()
plt.show()
# Same overlay for the test set.
plt.scatter(x=range(len(X_test)),y=y_test,label='Testing Actual')
plt.scatter(x=range(len(X_test)),y=TunedLog,color="red",label='Testing Prediction')
plt.xlabel('Index')
plt.ylabel('Value')
plt.title('Testing Set Prediction')
plt.legend()
plt.show()
# Decision-tree hyper-parameter grid.
# Fix: dropped max_features='auto' — deprecated (and removed in newer
# scikit-learn), and for classifiers identical to 'sqrt', so it only
# duplicated grid cells.
param_grid = {'max_depth': [3,4,5 ],'min_samples_split': [10, 15, 20],'min_samples_leaf': [5, 10, 15],'max_features':['sqrt','log2']}
#Setting up the Grid Search
D1=GridSearchCV(DecisionTreeClassifier(),param_grid,cv=5,scoring='accuracy')
#Fitting our training Data Set
D1.fit(X_train,y_train)
GridSearchCV(cv=5, estimator=DecisionTreeClassifier(),
param_grid={'max_depth': [3, 4, 5],
'max_features': ['auto', 'sqrt', 'log2'],
'min_samples_leaf': [5, 10, 15],
'min_samples_split': [10, 15, 20]},
scoring='accuracy')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. GridSearchCV(cv=5, estimator=DecisionTreeClassifier(),
param_grid={'max_depth': [3, 4, 5],
'max_features': ['auto', 'sqrt', 'log2'],
'min_samples_leaf': [5, 10, 15],
'min_samples_split': [10, 15, 20]},
scoring='accuracy')DecisionTreeClassifier()
DecisionTreeClassifier()
#Finding the Best parameter grid combination
D1.best_params_
{'max_depth': 5,
'max_features': 'sqrt',
'min_samples_leaf': 10,
'min_samples_split': 15}
#Fitting the best Params in D1
# NOTE: D1 (the fitted GridSearchCV) is overwritten here; best_params_ is read
# before the reassignment, so this works, but the name reuse is fragile.
D1=DecisionTreeClassifier(**D1.best_params_)
#Fitting the training DataSet
D1.fit(X_train,y_train)
DecisionTreeClassifier(max_depth=5, max_features='sqrt', min_samples_leaf=10,
min_samples_split=15)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. DecisionTreeClassifier(max_depth=5, max_features='sqrt', min_samples_leaf=10,
min_samples_split=15)#Predicting our test Data Set
# Predict the test set with the tuned decision tree.
D1Pred=D1.predict(X_test)
#Confusion Matrix
cm=confusion_matrix(y_test,D1Pred)
#Displaying the Confusion Matrix
CM=ConfusionMatrixDisplay(cm)
CM.plot()
<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x172c8d8bac0>
#Printing the classification Report
print (classification_report(y_test,D1Pred))
precision recall f1-score support
0 0.72 0.58 0.64 45
1 0.78 0.87 0.82 78
accuracy 0.76 123
macro avg 0.75 0.72 0.73 123
weighted avg 0.76 0.76 0.76 123
print (f"The accuracy score for test Dataset is {accuracy_score(y_test,D1Pred)} for Decision Tree")
The accuracy score for test Dataset is 0.7642276422764228 for Decision Tree
# Render the fitted decision tree.
fig = plt.figure(figsize=(50,30))
# Fix: class_names must list one label per class in sorted-class order
# ([0, 1] -> ['N', 'Y']). The original passed the entire Loan_Status column;
# plot_tree indexes class_names positionally, so nodes could be labeled with
# the wrong class (rows 0 and 1 of the data are 'Y' and 'N' — reversed).
_ = tree.plot_tree(D1, feature_names=list(X.columns),class_names=['N','Y'],filled=True,fontsize=30)
#Checking Accuracy score of the Decision Tree
# Training-set accuracy, to compare against the 0.76 test accuracy above.
D1trainpred=D1.predict(X_train)
print(accuracy_score(y_train,D1trainpred))
print(f"The accuracy score of Decision Tree is {accuracy_score(y_train,D1trainpred)}")
0.7906976744186046 The accuracy score of Decision Tree is 0.7906976744186046
# Actual vs predicted overlays by sample index (train, then test).
# NOTE(review): cmap="red" is silently ignored by scatter when no c= array is
# given; color="red" was likely intended.
plt.scatter(x=range(len(X_train)),y=y_train,label='Training Actual')
plt.scatter(x=range(len(X_train)),y=D1trainpred,cmap="red",label='Training Prediction')
plt.xlabel('Index')
plt.ylabel('Value')
plt.title('Training Set Prediction')
plt.legend()
plt.show()
plt.scatter(x=range(len(X_test)),y=y_test,label='Testing Actual')
plt.scatter(x=range(len(X_test)),y=D1Pred,cmap="red",label='Testing Prediction')
plt.xlabel('Index')
plt.ylabel('Value')
plt.title('Testing Set Prediction')
plt.legend()
plt.show()
#Specifying the Grid Search
base_classifier=DecisionTreeClassifier()
#Defining Parameter Grid
# NOTE(review): 'base_estimator' was renamed to 'estimator' in scikit-learn
# 1.2; kept as-is for the sklearn version this notebook ran on, but it will
# need updating on upgrade.
param_grid={'base_estimator':[base_classifier],'n_estimators':[50, 100, 150],'max_samples':[0.3,0.5,1],'max_features':[0.5,0.8,1]}
#Initiating Grid Search
Bag=GridSearchCV(BaggingClassifier(),param_grid,cv=5,scoring="accuracy")
#Feeding the training Data
Bag.fit(X_train,y_train)
GridSearchCV(cv=5, estimator=BaggingClassifier(),
param_grid={'base_estimator': [DecisionTreeClassifier()],
'max_features': [0.5, 0.8, 1],
'max_samples': [0.3, 0.5, 1],
'n_estimators': [50, 100, 150]},
scoring='accuracy')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. GridSearchCV(cv=5, estimator=BaggingClassifier(),
param_grid={'base_estimator': [DecisionTreeClassifier()],
'max_features': [0.5, 0.8, 1],
'max_samples': [0.3, 0.5, 1],
'n_estimators': [50, 100, 150]},
scoring='accuracy')BaggingClassifier()
BaggingClassifier()
# Rebuild the bagging classifier with the best grid-search parameters and
# retrain it on the full (resampled) training set.
best_bag_params = Bag.best_params_
Bag = BaggingClassifier(**best_bag_params)
Bag.fit(X_train, y_train)
BaggingClassifier(base_estimator=DecisionTreeClassifier(), max_features=0.5,
max_samples=0.3, n_estimators=100)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. BaggingClassifier(base_estimator=DecisionTreeClassifier(), max_features=0.5,
max_samples=0.3, n_estimators=100)DecisionTreeClassifier()
DecisionTreeClassifier()
#Predicting our test variable outcome
Bag1=Bag.predict(X_test)
#Printing the confusion Matrix
cm=confusion_matrix(y_test,Bag1)
CM=ConfusionMatrixDisplay(cm)
CM.plot()
<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x172cb987d60>
#Printing our Classification Report of our Decision Tree
print (classification_report(y_test,Bag1))
precision recall f1-score support
0 0.81 0.49 0.61 45
1 0.76 0.94 0.84 78
accuracy 0.77 123
macro avg 0.79 0.71 0.73 123
weighted avg 0.78 0.77 0.76 123
print (f"The Accuracy score of Bagging test is {accuracy_score(y_test,Bag1)}")
The Accuracy score of Bagging test is 0.7723577235772358
#Predicting our Training Variable Outcome
Bag2=Bag.predict(X_train)
#Printing our Accuracy score for Training Variable
# Large train/test gap (0.95 vs 0.77 above) suggests the bagger overfits.
print (f"The Accuracy score of Bagging train is {accuracy_score(y_train,Bag2)}")
The Accuracy score of Bagging train is 0.9505813953488372
# Actual vs predicted overlays by sample index (train, then test).
# NOTE(review): cmap="red" is silently ignored by scatter when no c= array is
# given; color="red" was likely intended.
plt.scatter(x=range(len(X_train)),y=y_train,label='Training Actual')
plt.scatter(x=range(len(X_train)),y=Bag2,cmap="red",label='Training Prediction')
plt.xlabel('Index')
plt.ylabel('Value')
plt.title('Training Set Prediction')
plt.legend()
plt.show()
plt.scatter(x=range(len(X_test)),y=y_test,label='Testing Actual')
plt.scatter(x=range(len(X_test)),y=Bag1,cmap="red",label='Testing Prediction')
plt.xlabel('Index')
plt.ylabel('Value')
plt.title('Testing Set Prediction')
plt.legend()
plt.show()
# Random-forest hyper-parameter grid.
# Fix: dropped max_features='auto' — deprecated (and removed in newer
# scikit-learn), and for classifiers identical to 'sqrt', so it only
# duplicated grid cells.
param_grid={"n_estimators":[64,100,128],"max_depth":[None,10,20],"min_samples_split":[2,5,10],"min_samples_leaf":[1,2,4],"max_features":['sqrt','log2']}
#Setting up the Grid Search
Rf=GridSearchCV(RandomForestClassifier(),param_grid,cv=5,scoring="accuracy")
#Fiting our grid with X train and Y train
Rf.fit(X_train,y_train)
GridSearchCV(cv=5, estimator=RandomForestClassifier(),
param_grid={'max_depth': [None, 10, 20],
'max_features': ['auto', 'sqrt', 'log2'],
'min_samples_leaf': [1, 2, 4],
'min_samples_split': [2, 5, 10],
'n_estimators': [64, 100, 128]},
scoring='accuracy')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. GridSearchCV(cv=5, estimator=RandomForestClassifier(),
param_grid={'max_depth': [None, 10, 20],
'max_features': ['auto', 'sqrt', 'log2'],
'min_samples_leaf': [1, 2, 4],
'min_samples_split': [2, 5, 10],
'n_estimators': [64, 100, 128]},
scoring='accuracy')RandomForestClassifier()
RandomForestClassifier()
# Retrain a random forest configured with the tuned hyper-parameters.
best_rf_params = Rf.best_params_
Randf = RandomForestClassifier(**best_rf_params)
Randf.fit(X_train, y_train)
RandomForestClassifier(max_depth=20, min_samples_leaf=2, min_samples_split=5,
n_estimators=64)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. RandomForestClassifier(max_depth=20, min_samples_leaf=2, min_samples_split=5,
n_estimators=64)Randfpred=Randf.predict(X_test)
# Confusion matrix for the random forest on the test set
# (Randfpred is computed in the cell above).
Crf=confusion_matrix(y_test,Randfpred)
CM=ConfusionMatrixDisplay(Crf)
CM.plot()
<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x172c70e4910>
#printing the classification Report
print (classification_report(y_test,Randfpred))
precision recall f1-score support
0 0.81 0.49 0.61 45
1 0.76 0.94 0.84 78
accuracy 0.77 123
macro avg 0.79 0.71 0.73 123
weighted avg 0.78 0.77 0.76 123
print (f"The accuracy score of Random Forest Classification on test Data is {accuracy_score(y_test,Randfpred)}")
The accuracy score of Random Forest Classification on test Data is 0.7723577235772358
#Finding the accuracy score with training data prediction
# ~0.94 train vs ~0.77 test accuracy — the forest overfits the training data.
Rftrain=Randf.predict(X_train)
print (f"The accuracy score of Random Forest Classification on training Data is {accuracy_score(y_train,Rftrain)}")
The accuracy score of Random Forest Classification on training Data is 0.940406976744186
# Actual vs predicted overlays by sample index (train, then test).
# NOTE(review): cmap="red" is silently ignored by scatter when no c= array is
# given; color="red" was likely intended.
plt.scatter(x=range(len(X_train)),y=y_train,label='Training Actual')
plt.scatter(x=range(len(X_train)),y=Rftrain,cmap="red",label='Training Prediction')
plt.xlabel('Index')
plt.ylabel('Value')
plt.title('Training Set Prediction')
plt.legend()
plt.show()
plt.scatter(x=range(len(X_test)),y=y_test,label='Testing Actual')
plt.scatter(x=range(len(X_test)),y=Randfpred,cmap="red",label='Testing Prediction')
plt.xlabel('Index')
plt.ylabel('Value')
plt.title('Testing Set Prediction')
plt.legend()
plt.show()
#Setting up the parameter grid
# Fixes: colsample_bytree [0,8,1] was a typo — 0 and 8 are invalid (the value
# must lie in (0, 1]); [0.8, 1] matches the subsample choices. reg_alpha also
# had a duplicated 0, which only repeated grid cells.
param_grid={'learning_rate':[0.01,0.1,0.3],'n_estimators':[64,100,128],'max_depth':[3,5,7],'subsample':[0.8,1],'colsample_bytree':[0.8,1],'reg_alpha':[0,0.1,0.5],'reg_lambda':[0,0.1,0.5]}
#Setting up the Grid Search
Xtreme=GridSearchCV(xgb.XGBClassifier(),param_grid,cv=5,scoring="accuracy")
#Feeding with our training Data
Xtreme.fit(X_train,y_train)
GridSearchCV(cv=5,
estimator=XGBClassifier(base_score=None, booster=None,
callbacks=None, colsample_bylevel=None,
colsample_bynode=None,
colsample_bytree=None,
early_stopping_rounds=None,
enable_categorical=False, eval_metric=None,
feature_types=None, gamma=None,
gpu_id=None, grow_policy=None,
importance_type=None,
interaction_constraints=None,
learning_rate=None,...
max_leaves=None, min_child_weight=None,
missing=nan, monotone_constraints=None,
n_estimators=100, n_jobs=None,
num_parallel_tree=None, predictor=None,
random_state=None, ...),
param_grid={'colsample_bytree': [0, 8, 1],
'learning_rate': [0.01, 0.1, 0.3],
'max_depth': [3, 5, 7], 'n_estimators': [64, 100, 128],
'reg_alpha': [0, 0.1, 0, 0.5],
'reg_lambda': [0, 0.1, 0.5], 'subsample': [0.8, 1]},
scoring='accuracy')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. GridSearchCV(cv=5,
estimator=XGBClassifier(base_score=None, booster=None,
callbacks=None, colsample_bylevel=None,
colsample_bynode=None,
colsample_bytree=None,
early_stopping_rounds=None,
enable_categorical=False, eval_metric=None,
feature_types=None, gamma=None,
gpu_id=None, grow_policy=None,
importance_type=None,
interaction_constraints=None,
learning_rate=None,...
max_leaves=None, min_child_weight=None,
missing=nan, monotone_constraints=None,
n_estimators=100, n_jobs=None,
num_parallel_tree=None, predictor=None,
random_state=None, ...),
param_grid={'colsample_bytree': [0, 8, 1],
'learning_rate': [0.01, 0.1, 0.3],
'max_depth': [3, 5, 7], 'n_estimators': [64, 100, 128],
'reg_alpha': [0, 0.1, 0, 0.5],
'reg_lambda': [0, 0.1, 0.5], 'subsample': [0.8, 1]},
scoring='accuracy')XGBClassifier(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=None, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=None, max_leaves=None,
min_child_weight=None, missing=nan, monotone_constraints=None,
n_estimators=100, n_jobs=None, num_parallel_tree=None,
predictor=None, random_state=None, ...)XGBClassifier(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=None, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=None, max_leaves=None,
min_child_weight=None, missing=nan, monotone_constraints=None,
n_estimators=100, n_jobs=None, num_parallel_tree=None,
predictor=None, random_state=None, ...)###Checking out the best parameters
# NOTE: Xtreme (the fitted GridSearchCV) is overwritten by its own
# best_params_ dict here; it works, but the name reuse is fragile.
Xtreme=Xtreme.best_params_
###Making the Gradient Boost with the best parameters rigged
XGB=xgb.XGBClassifier(**Xtreme)
###Fitting our XGB
XGB.fit(X_train,y_train)
XGBClassifier(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None, colsample_bytree=1,
early_stopping_rounds=None, enable_categorical=False,
eval_metric=None, feature_types=None, gamma=None, gpu_id=None,
grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=0.1, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=3, max_leaves=None,
min_child_weight=None, missing=nan, monotone_constraints=None,
n_estimators=128, n_jobs=None, num_parallel_tree=None,
predictor=None, random_state=None, ...)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. XGBClassifier(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None, colsample_bytree=1,
early_stopping_rounds=None, enable_categorical=False,
eval_metric=None, feature_types=None, gamma=None, gpu_id=None,
grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=0.1, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=3, max_leaves=None,
min_child_weight=None, missing=nan, monotone_constraints=None,
n_estimators=128, n_jobs=None, num_parallel_tree=None,
predictor=None, random_state=None, ...)XGPred=XGB.predict(X_test)
#Printing our Confusion Matrix
# Confusion matrix of the tuned XGBoost model on the held-out test set.
CMxgboost=confusion_matrix(y_test,XGPred)
CM=ConfusionMatrixDisplay(CMxgboost)
CM.plot()
<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x172c7503d00>
#Classification Report
# Per-class precision / recall / F1 for the XGBoost test predictions.
print (classification_report(y_test,XGPred))
precision recall f1-score support
0 0.78 0.47 0.58 45
1 0.75 0.92 0.83 78
accuracy 0.76 123
macro avg 0.76 0.69 0.71 123
weighted avg 0.76 0.76 0.74 123
# Finding the accuracy score for test data
accuracy_score(y_test,XGPred)
0.7560975609756098
# Test-set accuracy of the tuned XGBoost model.
print (f"the accuracy score of XGBoosting test prediction is {accuracy_score(y_test,XGPred)}")
the accuracy score of XGBoosting test prediction is 0.7560975609756098
# Finding the accuracy score of training DataSet
# NOTE(review): despite its name, this variable holds the training-set
# *predictions*, not an accuracy value; it is reused by the scatter plot below.
trainingaccuracy=XGB.predict(X_train)
print (f"the accuracy score of XGBoosting training prediction is {accuracy_score(y_train,trainingaccuracy)}")
# Visual check of the XGBoost fit on the training split: actual vs predicted.
plt.scatter(x=range(len(X_train)),y=y_train,label='Training Actual')
# Fix: `cmap` only applies when a `c` array is given, so the points were not
# actually red; `color` sets the marker colour directly.
plt.scatter(x=range(len(X_train)),y=trainingaccuracy,color="red",label='Training Prediction')
plt.xlabel('Index')
plt.ylabel('Value')
plt.title('Training Set Prediction')
plt.legend()
plt.show()
# Visual check of the XGBoost fit on the test split: actual vs predicted.
plt.scatter(x=range(len(X_test)),y=y_test,label='Testing Actual')
# Fix: `cmap` only applies when a `c` array is given, so the points were not
# actually red; `color` sets the marker colour directly.
plt.scatter(x=range(len(X_test)),y=XGPred,color="red",label='Testing Prediction')
plt.xlabel('Index')
plt.ylabel('Value')
plt.title('Testing Set Prediction')
plt.legend()
plt.show()
# Hyper-parameter search space for the support-vector classifier.
param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['rbf', 'sigmoid'],
    'gamma': ['scale', 'auto', 0.1, 0.01],
    'degree': [2, 3, 4],
}
#Setting up the Grid Search
Rbf = GridSearchCV(estimator=SVC(), param_grid=param_grid)
#Fitting our Rbf with training data
Rbf.fit(X_train, y_train)
GridSearchCV(estimator=SVC(),
param_grid={'C': [0.1, 1, 10], 'degree': [2, 3, 4],
'gamma': ['scale', 'auto', 0.1, 0.01],
'kernel': ['rbf', 'sigmoid']})In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. GridSearchCV(estimator=SVC(),
param_grid={'C': [0.1, 1, 10], 'degree': [2, 3, 4],
'gamma': ['scale', 'auto', 0.1, 0.01],
'kernel': ['rbf', 'sigmoid']})SVC()
SVC()
# Pull the winning hyper-parameters out of the fitted grid search.
Param = Rbf.best_params_
# Configure a standalone SVC with those values.
best_C = Param['C']
best_kernel = Param['kernel']
best_gamma = Param['gamma']
rbf = SVC(C=best_C, kernel=best_kernel, gamma=best_gamma)
# Train it on the training split.
rbf.fit(X_train, y_train)
SVC(C=10, gamma=0.01)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
SVC(C=10, gamma=0.01)
#Predict our test dataset
# Hard class predictions from the tuned RBF-kernel SVM.
rbfpred=rbf.predict(X_test)
#Printing our Confusion Matrix
CMSVM=confusion_matrix(y_test,rbfpred)
CM=ConfusionMatrixDisplay(CMSVM)
CM.plot()
<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x172c74cbe20>
#Printing the classification report
# Per-class precision / recall / F1 for the SVM test predictions.
print (classification_report(y_test,rbfpred))
precision recall f1-score support
0 0.75 0.13 0.23 45
1 0.66 0.97 0.79 78
accuracy 0.67 123
macro avg 0.71 0.55 0.51 123
weighted avg 0.69 0.67 0.58 123
#Predicting our training data set accuracy score
# Fix: corrected the typo "traing" -> "training" in the printed message.
# (Training accuracy of 1.0 here suggests the RBF SVM memorises the training
# set — worth noting next to the 0.67 test accuracy below.)
print (f"The accuracy score of rbf kernel in SVM of training data set is {accuracy_score(y_train,rbf.predict(X_train))}")
The accuracy score of rbf kernel in SVM of traing data set is 1.0
#Printing the accuracy score of test data
# Test-set accuracy of the tuned RBF-kernel SVM.
print (f"The accuracy score of rbf kernel in SVM of test data is {accuracy_score(y_test,rbfpred)}")
The accuracy score of rbf kernel in SVM of test data is 0.6666666666666666
# Visual check of the RBF SVM fit on the training split: actual vs predicted.
plt.scatter(x=range(len(X_train)),y=y_train,label='Training Actual')
# Fix: `cmap` only applies when a `c` array is given, so the points were not
# actually red; `color` sets the marker colour directly.
plt.scatter(x=range(len(X_train)),y=rbf.predict(X_train),color="red",label='Training Prediction')
plt.xlabel('Index')
plt.ylabel('Value')
plt.title('Training Set Prediction')
plt.legend()
plt.show()
# Visual check of the RBF SVM fit on the test split: actual vs predicted.
plt.scatter(x=range(len(X_test)),y=y_test,label='Testing Actual')
# Fix: `cmap` only applies when a `c` array is given, so the points were not
# actually red; `color` sets the marker colour directly.
plt.scatter(x=range(len(X_test)),y=rbfpred,color="red",label='Testing Prediction')
plt.xlabel('Index')
plt.ylabel('Value')
plt.title('Testing Set Prediction')
plt.legend()
plt.show()
# Degree-5 polynomial-kernel SVM.
# NOTE(review): the classification report below shows it predicts only class 1
# (0.00 recall for class 0) — presumably the unscaled income/loan features
# overwhelm the polynomial kernel; confirm whether features were standardised.
Pk=SVC(kernel="poly",degree=5)
#Fitting the train data in it
Pk.fit(X_train,y_train)
SVC(degree=5, kernel='poly')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
SVC(degree=5, kernel='poly')
#Finding our test data prediction
Pktest=Pk.predict(X_test)
#Printing our Confusion Matrix
PkSVM=confusion_matrix(y_test,Pktest)
CMPk=ConfusionMatrixDisplay(PkSVM)
CMPk.plot()
<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x24409b4a190>
#Printing the classification report
print (classification_report(y_test,Pktest))
precision recall f1-score support
0 0.00 0.00 0.00 45
1 0.63 1.00 0.78 78
accuracy 0.63 123
macro avg 0.32 0.50 0.39 123
weighted avg 0.40 0.63 0.49 123
#Finding the training accuracy score
# Training-set accuracy of the polynomial-kernel SVM.
print (f"The accuracy score of polynomial kernel in SVM of training data set is {accuracy_score(y_train,Pk.predict(X_train))}")
The accuracy score of polynomial kernel in SVM of training data set is 0.5174418604651163
#Finding our test accuracy score
# Test-set accuracy of the polynomial-kernel SVM.
print (f"The accuracy score of polynomial kernel in SVM of test data set is {accuracy_score(y_test,Pktest)}")
The accuracy score of polynomial kernel in SVM of test data set is 0.6341463414634146
# Visual check of the polynomial SVM fit on the training split.
plt.scatter(x=range(len(X_train)),y=y_train,label='Training Actual')
# Fix: `cmap` only applies when a `c` array is given, so the points were not
# actually red; `color` sets the marker colour directly.
plt.scatter(x=range(len(X_train)),y=Pk.predict(X_train),color="red",label='Training Prediction')
plt.xlabel('Index')
plt.ylabel('Value')
plt.title('Training Set Prediction')
plt.legend()
plt.show()
# Visual check of the polynomial SVM fit on the test split.
plt.scatter(x=range(len(X_test)),y=y_test,label='Testing Actual')
# Fix: `cmap` only applies when a `c` array is given, so the points were not
# actually red; `color` sets the marker colour directly.
plt.scatter(x=range(len(X_test)),y=Pktest,color="red",label='Testing Prediction')
plt.xlabel('Index')
plt.ylabel('Value')
plt.title('Testing Set Prediction')
plt.legend()
plt.show()
# Linear-kernel SVM baseline.
Lk=SVC(kernel='linear')
#Fitting the model with training data
Lk.fit(X_train,y_train)
SVC(kernel='linear')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
SVC(kernel='linear')
#Predicting our test data
Lk Pred is the hard class prediction on the held-out test set.
LkPred=Lk.predict(X_test)
#Plotting our confusion Matrix
LkSVM=confusion_matrix(y_test,LkPred)
CMLk=ConfusionMatrixDisplay(LkSVM)
CMLk.plot()
<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x2440915e550>
#Printing the classification report
print (classification_report(y_test,LkPred))
precision recall f1-score support
0 0.64 0.56 0.60 45
1 0.76 0.82 0.79 78
accuracy 0.72 123
macro avg 0.70 0.69 0.69 123
weighted avg 0.72 0.72 0.72 123
#Finding the training accuracy score
# Training-set accuracy of the linear-kernel SVM.
print (f"The accuracy score of Linear kernel in SVM of training data set is {accuracy_score(y_train,Lk.predict(X_train))}")
The accuracy score of Linear kernel in SVM of training data set is 0.8255813953488372
#Finding our test accuracy score
# Test-set accuracy of the linear-kernel SVM.
print (f"The accuracy score of Linear kernel in SVM of test data set is {accuracy_score(y_test,LkPred)}")
The accuracy score of Linear kernel in SVM of test data set is 0.7235772357723578
# Visual check of the linear SVM fit on the training split.
plt.scatter(x=range(len(X_train)),y=y_train,label='Training Actual')
# Fix: `cmap` only applies when a `c` array is given, so the points were not
# actually red; `color` sets the marker colour directly.
plt.scatter(x=range(len(X_train)),y=Lk.predict(X_train),color="red",label='Training Prediction')
plt.xlabel('Index')
plt.ylabel('Value')
plt.title('Training Set Prediction')
plt.legend()
plt.show()
# Visual check of the linear SVM fit on the test split.
plt.scatter(x=range(len(X_test)),y=y_test,label='Testing Actual')
# Fix: `cmap` only applies when a `c` array is given, so the points were not
# actually red; `color` sets the marker colour directly.
plt.scatter(x=range(len(X_test)),y=LkPred,color="red",label='Testing Prediction')
plt.xlabel('Index')
plt.ylabel('Value')
plt.title('Testing Set Prediction')
plt.legend()
plt.show()
from sklearn.model_selection import KFold, cross_val_score
# 5-fold cross-validation of the tuned logistic regression on the full data.
# NOTE(review): KFold here does not shuffle, so folds follow the row order of X
# — confirm the rows are not ordered by class before trusting these scores.
k_folds = KFold(n_splits = 5)
scores = cross_val_score(LogisticRegression(penalty="l2",C=10), X, y, cv = k_folds)
print("Cross Validation Scores: ", scores)
print("Average CV Score: ", scores.mean())
print("Number of CV Scores used in Average: ", len(scores))
Cross Validation Scores: [0.80487805 0.75609756 0.81300813 0.83739837 0.81147541] Average CV Score: 0.8045715047314408 Number of CV Scores used in Average: 5
from sklearn.model_selection import KFold, cross_val_score
# 5-fold cross-validation of a bagged decision-tree ensemble.
k_folds = KFold(n_splits = 5)
# NOTE(review): `base_estimator` was renamed to `estimator` in scikit-learn 1.2
# and removed in 1.4 — update this kwarg when the environment is upgraded.
scores = cross_val_score(BaggingClassifier(base_estimator=DecisionTreeClassifier(), max_features=0.5,
max_samples=0.3, n_estimators=100), X, y, cv = k_folds)
print("Cross Validation Scores: ", scores)
print("Average CV Score: ", scores.mean())
print("Number of CV Scores used in Average: ", len(scores))
Cross Validation Scores: [0.78861789 0.73170732 0.78861789 0.84552846 0.77868852] Average CV Score: 0.7866320138611222 Number of CV Scores used in Average: 5
from sklearn.model_selection import KFold, cross_val_score
# 5-fold cross-validation of the tuned random forest.
k_folds = KFold(n_splits = 5)
scores = cross_val_score(RandomForestClassifier(max_depth=20, min_samples_leaf=2, min_samples_split=5,
n_estimators=64), X, y, cv = k_folds)
print("Cross Validation Scores: ", scores)
print("Average CV Score: ", scores.mean())
print("Number of CV Scores used in Average: ", len(scores))
Cross Validation Scores: [0.80487805 0.74796748 0.79674797 0.84552846 0.81147541] Average CV Score: 0.8013194722111155 Number of CV Scores used in Average: 5
def plot_roc_curve(y_test, y_pred):
    """Plot the ROC curve (TPR vs FPR) for the given true labels and scores."""
    fpr, tpr, _thresholds = roc_curve(y_test, y_pred)
    plt.plot(fpr, tpr)
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
# Hard 0/1 test predictions from the tuned logistic regression.
# NOTE(review): class labels, not probabilities, are passed to the ROC helper,
# so the "curve" degenerates to a single operating point — consider
# predict_proba(...)[:, 1] for a real ROC curve.
TunedLog=LogisticRegression(penalty="l2",C=10).fit(X_train,y_train).predict(X_test)
plot_roc_curve(y_test, TunedLog)
print(f'model 1 AUC score: {roc_auc_score(y_test, TunedLog)}')
model 1 AUC score: 0.7474358974358974
def plot_roc_curve(y_test, y_pred):
    """Draw the ROC curve for the supplied true labels and predictions."""
    false_pos_rate, true_pos_rate, _ = roc_curve(y_test, y_pred)
    plt.plot(false_pos_rate, true_pos_rate)
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
# Hard 0/1 test predictions from the tuned random forest.
# NOTE(review): same caveat as above — labels, not probabilities, are fed to
# the ROC helper; also the printed label says "model 1" again, which was
# presumably meant to read "model 2".
Rfpred=RandomForestClassifier(max_depth=20, min_samples_leaf=2, min_samples_split=5,n_estimators=64).fit(X_train,y_train).predict(X_test)
plot_roc_curve(y_test, Rfpred)
print(f'model 1 AUC score: {roc_auc_score(y_test, Rfpred)}')
model 1 AUC score: 0.7188034188034188
from sklearn.feature_selection import SelectKBest, chi2

# Score every feature against the target with the chi-squared test
# (k='all' keeps every feature; we only want the scores, not a selection).
features = X
target = y
best_features = SelectKBest(score_func=chi2, k='all')
fit = best_features.fit(features, target)
# Tabulate the scores per feature and show them best-first.
featureScores = pd.DataFrame(data=fit.scores_, index=list(X.columns), columns=['Chi Squared Score'])
featureScores.sort_values(by='Chi Squared Score', ascending=False)
| Chi Squared Score | |
|---|---|
| CoapplicantIncome | 11342.041603 |
| ApplicantIncome | 93.904964 |
| LoanAmount | 39.211545 |
| Credit_History | 26.005877 |
| Property_Area_Semiurban | 7.103093 |
| Loan Term is 480.0 | 5.760693 |
| Property_Area_Rural | 4.410584 |
| Loan Term is 36.0 | 4.395833 |
| Education_Not Graduate | 3.540502 |
| Married_No | 3.355645 |
| Dependents_2 | 1.996446 |
| Married_Yes | 1.534292 |
| Loan Term is 120.0 | 1.364929 |
| Education_Graduate | 0.988390 |
| Loan Term is 60.0 | 0.909953 |
| Property_Area_Urban | 0.783946 |
| Dependents_1 | 0.768400 |
| Loan Term is 360.0 | 0.458657 |
| Loan Term is 12.0 | 0.454976 |
| Dependents_3+ | 0.384200 |
| Loan Term is 300.0 | 0.312800 |
| Loan Term is 180.0 | 0.162871 |
| Gender_Female | 0.162407 |
| Gender_Male | 0.080689 |
| Loan Term is 84.0 | 0.073176 |
| Loan Term is 240.0 | 0.073176 |
| Dependents_0 | 0.010509 |
| Self_Employed_Yes | 0.007285 |
| Self_Employed_No | 0.003910 |
# Feature importances from the tuned decision tree, printed best-first.
tree_model = DecisionTreeClassifier(max_depth=5, max_features='sqrt',
                                    min_samples_leaf=10, min_samples_split=15)
importances = tree_model.fit(X_train, y_train).feature_importances_
# Indices of the features ordered from most to least important.
sorted_indices = np.argsort(importances)[::-1]
feat_labels = X.columns
for rank in range(X_train.shape[1]):
    idx = sorted_indices[rank]
    print("%2d) %-*s %f" % (rank + 1, 30, feat_labels[idx], importances[idx]))
1) Credit_History 0.779137 2) ApplicantIncome 0.116634 3) CoapplicantIncome 0.054008 4) Dependents_2 0.023833 5) Property_Area_Semiurban 0.012455 6) Dependents_0 0.010358 7) Loan Term is 180.0 0.002683 8) Married_No 0.000506 9) Property_Area_Rural 0.000386 10) Dependents_3+ 0.000000 11) Education_Graduate 0.000000 12) Self_Employed_No 0.000000 13) Dependents_1 0.000000 14) Married_Yes 0.000000 15) Education_Not Graduate 0.000000 16) Gender_Male 0.000000 17) LoanAmount 0.000000 18) Property_Area_Urban 0.000000 19) Self_Employed_Yes 0.000000 20) Loan Term is 480.0 0.000000 21) Loan Term is 12.0 0.000000 22) Loan Term is 36.0 0.000000 23) Loan Term is 60.0 0.000000 24) Loan Term is 84.0 0.000000 25) Loan Term is 120.0 0.000000 26) Loan Term is 240.0 0.000000 27) Loan Term is 300.0 0.000000 28) Loan Term is 360.0 0.000000 29) Gender_Female 0.000000
# Bar chart of the decision-tree feature importances, most important first.
n_features = X_train.shape[1]
positions = range(n_features)
plt.title('Feature Importance')
plt.bar(positions, importances[sorted_indices], align='center')
plt.xticks(positions, X_train.columns[sorted_indices], rotation=90)
plt.tight_layout()
plt.show()
# Feature importances from the tuned random forest, printed best-first.
importances=RandomForestClassifier(max_depth=20, min_samples_leaf=2, min_samples_split=5,n_estimators=64).fit(X_train,y_train).feature_importances_
# Sort the feature importance in descending order
#
sorted_indices = np.argsort(importances)[::-1]
feat_labels = X.columns
# Print "rank) feature-name importance" with the name left-justified to 30 cols.
for f in range(X_train.shape[1]):
print("%2d) %-*s %f" % (f + 1, 30,
feat_labels[sorted_indices[f]],
importances[sorted_indices[f]]))
1) Credit_History 0.310599 2) ApplicantIncome 0.101718 3) LoanAmount 0.095673 4) Property_Area_Semiurban 0.083143 5) CoapplicantIncome 0.064988 6) Education_Graduate 0.043438 7) Dependents_2 0.032332 8) Dependents_0 0.031539 9) Loan Term is 360.0 0.027836 10) Property_Area_Urban 0.025686 11) Property_Area_Rural 0.023188 12) Married_No 0.023145 13) Gender_Male 0.021388 14) Married_Yes 0.021017 15) Education_Not Graduate 0.018366 16) Dependents_1 0.015824 17) Gender_Female 0.015306 18) Self_Employed_Yes 0.010136 19) Dependents_3+ 0.010078 20) Self_Employed_No 0.009982 21) Loan Term is 180.0 0.006801 22) Loan Term is 480.0 0.003267 23) Loan Term is 300.0 0.002897 24) Loan Term is 36.0 0.000549 25) Loan Term is 240.0 0.000381 26) Loan Term is 60.0 0.000328 27) Loan Term is 120.0 0.000258 28) Loan Term is 84.0 0.000136 29) Loan Term is 12.0 0.000000
# Bar chart of the random-forest feature importances, most important first.
plt.title('Feature Importance')
plt.bar(range(X_train.shape[1]), importances[sorted_indices], align='center')
plt.xticks(range(X_train.shape[1]), X_train.columns[sorted_indices], rotation=90)
plt.tight_layout()
plt.show()